home *** CD-ROM | disk | FTP | other *** search
- Newsgroups: comp.sources.misc
- keywords: sort
- subject: v08i092: fast & simple sorting program
- From: allbery@uunet.UU.NET (Brandon S. Allbery - comp.sources.misc)
- Reply-To: ok@cs.mu.oz.au.UUCP (Richard O'Keefe)
-
- Posting-number: Volume 8, Issue 92
- Submitted-by: ok@cs.mu.oz.au.UUCP (Richard O'Keefe)
- Archive-name: nsort
-
- #!/bin/sh
- # This is a shell archive, meaning:
- # 1. Remove everything above the #!/bin/sh line.
- # 2. Save the resulting text in a file.
- # 3. Execute the file with /bin/sh (not csh) to create the files:
- # README
- # makefile
- # nsort.1
- # nsort.c
- cat >README <<'------ EOF ------'
- nsort is a totally stripped down sort(1); it reads a file into memory,
- sorts it using merge sort, and writes the result out. I wrote it to
- show that merge sort is fast -- sort(1) is documented as using a
- version of quicksort internally, and nsort beats it easily even when
- sort(1) is given enough memory to sort in memory.
- To compile it, just run make(1).
- ------ EOF ------
- ls -l README
- cat >makefile <<'------ EOF ------'
- nsort:
- cc -o nsort -O nsort.c
- ------ EOF ------
- ls -l makefile
- cat >nsort.1 <<'------ EOF ------'
- .TH NSORT 1
- .\"/*%nroff -man %
- .SH NAME
- nsort \*- fast and simple sorting program
- .SH SYNOPSIS
- .B nsort
- <in >out
- .SH DESCRIPTION
- The
- .B nsort
- command
- reads lines from its standard input, sorts them, and writes the
- sorted lines to its standard output. It behaves just like
- sort(1) when the latter is given no arguments. The only reason
- for using
- .B nsort
- is speed: when it can be used it is up to twice as fast as sort(1).
- .PP
- This program does
- .I not
- accept file names as arguments. To process more than
- one input file, pipe the output of cat(1) into
- .B nsort.
- .PP
- The program does not need to know how many lines there
- are in the standard input, nor does it need to read the
- standard input more than once. However, the data must
- fit into memory.
- .SH OPTIONS
- The
- .B nsort
- command has no options.
- .SH EXAMPLES
- .IP % 3
- ls -f . | nsort
- .PP
- has the same effect as a plain `ls'.
- .SH DIAGNOSTICS
- An error message is printed on stderr and the program halts without
- producing any output if
- .PP
- there are any arguments on the command line (status EX_USAGE is
- returned in this case), or
- .PP
- the program is unable to obtain enough memory using malloc (if the
- file has N lines and C characters, roughly 8N+C bytes are needed)
- (status EX_SOFTWARE is returned in this case), or
- .PP
- something goes wrong while reading standard input
- (status EX_IOERR is returned in this case).
- .PP
- All error messages indicate the actual name the program was invoked by.
- If an error message is produced, the rest of the standard input is not
- read, so you are likely to get "Broken pipe" messages from upstream
- processes.
- .SH BUGS
- Input lines which contain NUL characters are quietly truncated.
- sort(1) cannot handle such lines either, but complains.
- .SH "SEE ALSO"
- sort(1)
- .SH AUTHOR
- Richard A. O'Keefe
- ------ EOF ------
- ls -l nsort.1
- cat >nsort.c <<'------ EOF ------'
- /* File : nsort.c
- Author : Richard A. O'Keefe
- Updated: 1988
- Purpose: Fast sorting command for smallish files.
-
- This program has no copyright notice. You can do anything you please
- with it, except blame me if it doesn't work.
- */
-
- #ifndef lint
- static char SCCSid[] = "%Z%%E% %M% %I%";
- #endif/*lint*/
-
- /* nsort <input >output
-
- is a very simple sorting program which has been written to be as fast
- as possible. It is equivalent to sort with no arguments. That is,
- it sorts its standard input to its standard output, and compares
- entire lines.
-
- It uses a natural merge, so if the input is already in order it takes
- linear time, and it reads the entire file into memory, using a single
- read() if the standard input is a regular file.
-
- Sorting random permutations of /usr/dict/words on a Sun-3/50:
- sort(1) : 15 seconds nsort : 8 seconds (with SCSI disc)
- 17 seconds 11 seconds (NFS over Ethernet)
- */
-
- #include <stdio.h>
-
- /* The following values are taken from <sysexits.h>, but are copied here
- in case you lack that BSD header file.
- */
- #define EX_OK 0 /* nothing went wrong */
- #define EX_USAGE 64 /* something wrong with the command line */
- #define EX_SOFTWARE 70 /* internal error (here, memory ran out) */
- #define EX_IOERR 71 /* transput error (here, read() trouble) */
-
- #define STDIN 0
-
- extern char * malloc();
- extern char * strcpy();
- extern int strcmp();
- extern int strlen();
- extern void perror();
- extern int lseek();
- extern int read();
-
- typedef struct item_rec *item_ptr;
-
- struct item_rec
- {
- item_ptr next;
- char* data;
- };
-
- #define IN_ORDER(x, y) (strcmp((x)->data, (y)->data) <= 0)
-
-
- item_ptr nat_sort(list)
- item_ptr list;
- {
- item_ptr stack[30];
- item_ptr *sp = stack;
- int runs = 0; /* total number of runs processed */
- register item_ptr p, q, r;
- struct item_rec header;
- int k;
-
- while (p = list) {
- /* pick up a run from the front of list, setting */
- /* p = (pointer to beginning of run), list = (rest of list) */
- if (q = p->next) {
- list = q->next;
- if (!IN_ORDER(p, q)) r = q, q = p, p = r;
- p->next = q;
- for (r = list; r && IN_ORDER(q, r); r = r->next)
- q->next = r, q = r;
- q->next = (item_ptr)0;
- list = r;
- } else {
- list = (item_ptr)0;
- }
- runs++;
- /* merge this run with 0 or more runs off the top of the stack */
- for (k = runs; 1 &~ k; k >>= 1) {
- q = *--sp;
- r = &header;
- while (q && p)
- if (IN_ORDER(q, p)) {
- r->next = q;
- r = q, q = q->next;
- } else {
- r->next = p;
- r = p, p = p->next;
- }
- r->next = q ? q : p;
- p = header.next;
- }
- /* push the merged run onto the stack */
- *sp++ = p;
- }
- if (sp == stack) return (item_ptr)0;
- /* merge all the runs on the stack */
- p = *--sp;
- while (sp != stack) {
- q = *--sp;
- r = &header;
- while (q && p)
- if (IN_ORDER(q, p)) {
- r->next = q;
- r = q, q = q->next;
- } else {
- r->next = p;
- r = p, p = p->next;
- }
- r->next = q ? q : p;
- p = header.next;
- }
- return p;
- }
-
- item_ptr alloc_item(data)
- char *data;
- {
- register item_ptr result;
-
- result = (item_ptr)malloc(sizeof *result + strlen(data) + 1);
- if (result) {
- result->next = (item_ptr)0;
- result->data = strcpy((char*)result + sizeof *result, data);
- }
- return result;
- }
-
- /* What we really want to do is to slurp the entire file in with one call
- to read(). In order to do that, we need to know how much there is. A
- file's size can be determined either by calling fstat() or by using an
- lseek() to the end. The number you get from fstat() doesn't mean much
- for pipes and terminals, so lseek() appears to be the better approach.
- Note that even when stdin is connected to a file, part of it may have
- been read already. In that case, we should not include the part which
- has been read. So we do
- i = lseek(STDIN, 0, 1)
- to discover the current position (and simultaneously to find out if we
- can use lseek() at all on this file descriptor). One problem remains;
- the size of the file may change while we are reading it. In that case
- we'll stop early if we have to, but if the file grows we'll miss stuff
- added at the end.
- */
- main(argc, argv)
- int argc;
- char **argv;
- {
- struct item_rec header;
- item_ptr p, q;
- char *progname = argc >= 1 ? argv[0] : "nsort";
- int i;
-
- if (argc != 1) {
- fprintf(stderr, "Usage: %s <unordered >sorted\n", progname);
- exit(EX_USAGE);
- }
- i = lseek(STDIN, 0, 1); /* current position in stream */
- if (i < 0) { /* can't find out the size by seeking */
- char buffer[1024];
-
- for (p = &header
- ; fgets(buffer, sizeof buffer, stdin)
- ; p->next = q, p = q) {
- i = strlen(buffer);
- if (i > 0 && buffer[i-1] == '\n') buffer[i-1] = '\0';
- if (!(q = alloc_item(buffer))) {
- fprintf(stderr, "%s: ran out of memory\n", progname);
- exit(EX_SOFTWARE);
- }
- } else {
- register char *b, *c;
- register int n;
-
- n = lseek(STDIN, 0, 2) - i; /* part of stdin may have been read */
- (void) lseek(STDIN, i, 0); /* go back to original position */
- if (!(b = malloc(n+1))) {
- fprintf(stderr, "%s: ran out of memory\n", progname);
- exit(EX_SOFTWARE);
- }
- for (c = b; (i = n-(c-b)) > 0; c += i) {
- i = read(STDIN, c, i);
- if (i < 0) {
- perror(progname);
- exit(EX_IOERR);
- } else
- if (i == 0) {
- break;
- }
- }
- /* it is possible that the file may have been truncated */
- /* while we were reading it. */
- n = c-b;
- for (p = &header; n > 0; b = c, p->next = q, p = q) {
- for (c = b; --n >= 0; c++)
- if (*c == '\n') {
- *c++ = '\0';
- break;
- }
- if (n < 0) *c = '\0';
- if (!(q = (item_ptr) malloc(sizeof *q))) {
- fprintf(stderr, "nsort: ran out of memory\n");
- exit(EX_SOFTWARE);
- }
- q->data = b;
- }
- }
- p->next = (item_ptr)0;
- p = header.next;
- for (p = nat_sort(p); p; p = p->next)
- puts(p->data);
- exit(EX_OK);
- }
-
- ------ EOF ------
- ls -l nsort.c
- exit 0
-
-